import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import plotly.subplots as sp
import plotly.figure_factory as ff
from itertools import cycle
import warnings
import re
import scipy
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
# Load the cast/crew credits dataset.
# BUG FIX: the original path used a backslash ('Dataset\credits.csv'), which
# only works on Windows and is fragile inside a string literal (sequences
# like '\t' or '\n' would be interpreted as escapes). A forward slash works
# on every OS and matches the style used for titles.csv below.
credits = pd.read_csv('Dataset/credits.csv')
credits.head()
| person_id | id | name | character | role | |
|---|---|---|---|---|---|
| 0 | 59401 | ts20945 | Joe Besser | Joe | ACTOR |
| 1 | 31460 | ts20945 | Moe Howard | Moe | ACTOR |
| 2 | 31461 | ts20945 | Larry Fine | Larry | ACTOR |
| 3 | 21174 | tm19248 | Buster Keaton | Johnny Gray | ACTOR |
| 4 | 28713 | tm19248 | Marion Mack | Annabelle Lee | ACTOR |
# Quick structural inspection: row/column count and column names.
credits.shape
(124235, 5)
credits.columns
Index(['person_id', 'id', 'name', 'character', 'role'], dtype='object')
After reading the credits.csv dataset it is found that the dataset contains 5 columns and 124,000+ rows (124,235 × 5). The column names are listed in the output above.
Upon analysing the columns it is evident that this dataset contains data about the actors and what character they played.
Now, the dataset is to be analysed for null values.
Null value analysis is important because the null values can lead to erroneous and misleading analysis of the dataset.
credits.isnull().sum().sort_values(ascending=False)
character 16287 person_id 0 id 0 name 0 role 0 dtype: int64
round(100*(credits.isnull().sum()/len(credits.index)),2).sort_values(ascending=False)
character 13.11 person_id 0.00 id 0.00 name 0.00 role 0.00 dtype: float64
This shows us that the 'character' feature in the dataset has around 13% null values. This is a fairly low proportion, so it can be handled without dropping the column.
credits['character'].replace(np.nan, "No value", inplace=True)
credits.head()
| person_id | id | name | character | role | |
|---|---|---|---|---|---|
| 0 | 59401 | ts20945 | Joe Besser | Joe | ACTOR |
| 1 | 31460 | ts20945 | Moe Howard | Moe | ACTOR |
| 2 | 31461 | ts20945 | Larry Fine | Larry | ACTOR |
| 3 | 21174 | tm19248 | Buster Keaton | Johnny Gray | ACTOR |
| 4 | 28713 | tm19248 | Marion Mack | Annabelle Lee | ACTOR |
Now the null values have been handled.
# Load the titles metadata dataset (movies/shows with ratings and genres).
title = pd.read_csv("Dataset/titles.csv")
title.head()
| id | title | type | description | release_year | age_certification | runtime | genres | production_countries | seasons | imdb_id | imdb_score | imdb_votes | tmdb_popularity | tmdb_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ts20945 | The Three Stooges | SHOW | The Three Stooges were an American vaudeville ... | 1934 | TV-PG | 19 | ['comedy', 'family', 'animation', 'action', 'f... | ['US'] | 26.0 | tt0850645 | 8.6 | 1092.0 | 15.424 | 7.6 |
| 1 | tm19248 | The General | MOVIE | During America’s Civil War, Union spies steal ... | 1926 | NaN | 78 | ['action', 'drama', 'war', 'western', 'comedy'... | ['US'] | NaN | tt0017925 | 8.2 | 89766.0 | 8.647 | 8.0 |
| 2 | tm82253 | The Best Years of Our Lives | MOVIE | It's the hope that sustains the spirit of ever... | 1946 | NaN | 171 | ['romance', 'war', 'drama'] | ['US'] | NaN | tt0036868 | 8.1 | 63026.0 | 8.435 | 7.8 |
| 3 | tm83884 | His Girl Friday | MOVIE | Hildy, the journalist former wife of newspaper... | 1940 | NaN | 92 | ['comedy', 'drama', 'romance'] | ['US'] | NaN | tt0032599 | 7.8 | 57835.0 | 11.270 | 7.4 |
| 4 | tm56584 | In a Lonely Place | MOVIE | An aspiring actress begins to suspect that her... | 1950 | NaN | 94 | ['thriller', 'drama', 'romance'] | ['US'] | NaN | tt0042593 | 7.9 | 30924.0 | 8.273 | 7.6 |
# Quick structural inspection: row/column count and column names.
title.shape
(9871, 15)
title.columns
Index(['id', 'title', 'type', 'description', 'release_year',
'age_certification', 'runtime', 'genres', 'production_countries',
'seasons', 'imdb_id', 'imdb_score', 'imdb_votes', 'tmdb_popularity',
'tmdb_score'],
dtype='object')
After reading the titles.csv dataset it is found that the dataset contains 15 columns and 9800+ rows (9,871 × 15). The column names are listed in the output above.
Upon analysing the columns it is evident that this dataset contains data about various movies and web series available on Amazon Prime along with data about their release year, genres, production countries, type, runtime etc., along with their IMDB(Internet Movie Database) and TMDB(The Movie Database) scores and popularity.
Now, the dataset is to be analysed for null values.
Null value analysis is important because the null values can lead to erroneous and misleading analysis of the dataset.
title.isnull().sum().sort_values(ascending=False)
seasons 8514 age_certification 6487 tmdb_score 2082 imdb_votes 1031 imdb_score 1021 imdb_id 667 tmdb_popularity 547 description 119 id 0 title 0 type 0 release_year 0 runtime 0 genres 0 production_countries 0 dtype: int64
round(100*(title.isnull().sum()/len(title.index)),2).sort_values(ascending=False)
seasons 86.25 age_certification 65.72 tmdb_score 21.09 imdb_votes 10.44 imdb_score 10.34 imdb_id 6.76 tmdb_popularity 5.54 description 1.21 id 0.00 title 0.00 type 0.00 release_year 0.00 runtime 0.00 genres 0.00 production_countries 0.00 dtype: float64
This analysis shows us that the 'seasons' and 'age_certification' features in the dataset have more than 50% null values, so it would be better to drop these features. Dropping them is the better option because the amount of missing data is large and cannot be handled simply by replacing the null values.
title = title.drop(columns=['seasons','age_certification'])
round(100*(title.isnull().sum()/len(title.index)),2).sort_values(ascending=False)
tmdb_score 21.09 imdb_votes 10.44 imdb_score 10.34 imdb_id 6.76 tmdb_popularity 5.54 description 1.21 id 0.00 title 0.00 type 0.00 release_year 0.00 runtime 0.00 genres 0.00 production_countries 0.00 dtype: float64
The columns with relatively few null values also need to be handled. There are two kinds of values to consider when handling the null data:
-> For values that can repeat, we can replace the null values with the mode (the value that occurs most frequently in that column).
-> For values that must be unique (such as identifiers), we have no choice other than replacing the null values with 'No Data'.
# Impute the remaining sparse nulls:
#  - repeating numeric values -> the column's own mode;
#  - unique identifiers / free text -> the 'No Data' placeholder.
title['tmdb_score'] = title['tmdb_score'].fillna(title['tmdb_score'].mode()[0])
# BUG FIX: imdb_votes was previously filled with the mode of imdb_score
# (a 0-10 rating), which corrupted the vote counts — use imdb_votes' own mode.
title['imdb_votes'] = title['imdb_votes'].fillna(title['imdb_votes'].mode()[0])
title['imdb_score'] = title['imdb_score'].fillna(title['imdb_score'].mode()[0])
title['tmdb_popularity'] = title['tmdb_popularity'].fillna(title['tmdb_popularity'].mode()[0])
title['imdb_id'] = title['imdb_id'].fillna('No Data')
# fillna assignment replaces the deprecated inplace replace() pattern; the
# original also re-filled imdb_id a second time, which was redundant.
title['description'] = title['description'].fillna('No Data')
The dataset has been prepared for EDA now. All the null values have been properly handled.
# --- Figure 1: overall characteristics of the titles dataset ---
# Five stacked subplots: histograms of release year, runtime, IMDB votes and
# IMDB rating, plus a pie chart of MOVIE vs SHOW counts.
fig = sp.make_subplots(
rows = 5,
cols = 1,
subplot_titles=['Release Year',
'Runtime',
'IMDB Votes',
'IMDB Rating',
'Movie vs Show'],
specs = [[{'type':'histogram'}],
[{'type':'histogram'}],
[{'type':'histogram'}],
[{'type':'histogram'}],
[{'type':'pie'}]]
)
# One histogram trace per numeric column of interest; white text for the
# dark background applied in the layout below.
release_year = go.Histogram(
x=title.release_year,
name = 'Release Year',
legendgroup='Release Year',
textfont=dict(color='white')
)
runtime = go.Histogram(
x=title.runtime,
name = 'Runtime',
legendgroup='Runtime',
textfont=dict(color='white')
)
imdb_votes = go.Histogram(
x=title.imdb_votes,
name = 'IMDB Votes',
legendgroup='IMDB Votes',
textfont=dict(color='white')
)
imdb_rating = go.Histogram(
x=title.imdb_score,
name = 'IMDB Rating',
legendgroup='IMDB Rating',
textfont=dict(color='white')
)
# Pie chart of content types (MOVIE vs SHOW counts).
type_counts = title.type.value_counts().to_dict()
movie_show = go.Pie(
labels=list(type_counts.keys()),
values=list(type_counts.values()),
name='Types',
hoverinfo='label+value+percent',
legendgroup='Types'
)
# Attach each trace to its row and style its axes for the dark theme.
fig.add_trace(release_year, row=1,col=1)
fig.update_xaxes(title=dict(text='Release Year', font=dict(color='white')),tickfont=dict(color='white'),row=1,col=1)
fig.update_yaxes(title=dict(text='Count', font=dict(color='white')),tickfont=dict(color='white'),row=1,col=1)
fig.add_trace(runtime,row=2,col=1)
fig.update_xaxes(title=dict(text='Runtime', font=dict(color='white')),tickfont=dict(color='white'),row=2,col=1)
fig.update_yaxes(title=dict(text='Count', font=dict(color='white')),tickfont=dict(color='white'),row=2,col=1)
fig.add_trace(imdb_votes,row=3,col=1)
fig.update_xaxes(title=dict(text='IMDB Votes', font=dict(color='white')),tickfont=dict(color='white'),row=3,col=1)
fig.update_yaxes(title=dict(text='Count', font=dict(color='white')),tickfont=dict(color='white'),row=3,col=1)
fig.add_trace(imdb_rating,row=4,col=1)
fig.update_xaxes(title=dict(text='IMDB Rating', font=dict(color='white')),tickfont=dict(color='white'),row=4,col=1)
fig.update_yaxes(title=dict(text='Count', font=dict(color='white')),tickfont=dict(color='white'),row=4,col=1)
fig.add_trace(movie_show,row=5,col=1)
# Overall layout: tall canvas, dark background, centered white title.
fig.update_layout(
template='plotly',
height=2000
)
fig.update(
layout_title_text="Characteristics of Data",
layout_title_font_size = 20,
layout_title_font_color = 'white',
layout_title_x=0.5,
layout_paper_bgcolor='rgb(20,20,20)',
layout_plot_bgcolor='rgb(20,20,20)',
layout_font=dict(color="white")
)
fig.show()
# Parse the stringified genre lists (e.g. "['comedy', 'drama']") into real
# Python lists of genre tokens.
# FIX: use a raw string for the regex — "\w" in a plain string is an invalid
# escape sequence (DeprecationWarning today, SyntaxError in future Python).
title["genres"] = title["genres"].apply(lambda x: re.findall(r"\w+", x))
# Collect the unique set of genre names across all titles.
genres = list(title["genres"].values)
genres = list(set([item for sublist in genres for item in sublist]))
# One-hot encode: add a 0/1 indicator column per genre.
# (Dropped the unused enumerate index from the original loop.)
for genre in genres:
    title[genre] = title.genres.apply(lambda x: 1 if genre in x else 0).astype(int)
print("Number of genres: ",len(genres))
print("Genres: ",genres)
Number of genres: 19 Genres: ['european', 'reality', 'romance', 'comedy', 'fantasy', 'war', 'action', 'scifi', 'sport', 'western', 'horror', 'thriller', 'history', 'music', 'crime', 'documentation', 'animation', 'family', 'drama']
# Per-genre title counts, split by content type. Iterating the genres in
# alphabetical order makes the dicts come out already key-sorted, matching
# the explicit sort the original performed afterwards.
movie_dict = {
    g: title.query("type == 'MOVIE'")[g].sum() for g in sorted(genres)
}
show_dict = {
    g: title.query("type == 'SHOW'")[g].sum() for g in sorted(genres)
}
# --- Figure 2: genre distribution by number of movies and shows ---
fig = sp.make_subplots(
    rows=2,
    cols=1,
    subplot_titles=['Movies', 'Shows'],
    # FIX: bar traces live on cartesian axes, so the subplot spec type is
    # 'xy' — the capitalised 'Bar' is not a valid make_subplots spec type.
    specs=[[{'type': 'xy'}],
           [{'type': 'xy'}]]
)
movie_count = go.Bar(
    x=list(movie_dict.keys()),
    y=list(movie_dict.values()),
    name='Movies'
)
show_count = go.Bar(
    x=list(show_dict.keys()),
    y=list(show_dict.values()),
    name='Shows'
)
fig.add_trace(movie_count, row=1, col=1)
fig.update_xaxes(title_text="Genres", row=1, col=1)
# FIX: the y axis shows title counts, not genres — label it accordingly.
fig.update_yaxes(title_text="Count", row=1, col=1)
fig.add_trace(show_count, row=2, col=1)
fig.update_xaxes(title_text="Genres", row=2, col=1)
fig.update_yaxes(title_text="Count", row=2, col=1)
# Dark theme layout with a centered white title.
fig.update(
    layout_title_text="Genre Distribution based on number of Movies and Shows",
    layout_title_font_size = 20,
    layout_title_x=0.5,
    layout_height = 1000,
    layout_paper_bgcolor='rgb(20,20,20)',
    layout_plot_bgcolor='rgb(20,20,20)',
    layout_font=dict(color="white"),
    layout_title_font_color = 'white'
)
fig.show()
# Total IMDB votes per genre, split by content type.
# FIX: the original did groupby(genre)['imdb_votes'].sum().__getitem__(1),
# which is non-idiomatic and raises KeyError when a genre has no titles of
# that type (no group with indicator value 1). Summing the votes over a
# boolean mask is equivalent for present genres and yields 0 otherwise.
movie_popularity_dict = {
    g: title.loc[(title['type'] == 'MOVIE') & (title[g] == 1), 'imdb_votes'].sum()
    for g in sorted(genres)
}
show_popularity_dict = {
    g: title.loc[(title['type'] == 'SHOW') & (title[g] == 1), 'imdb_votes'].sum()
    for g in sorted(genres)
}
# --- Figure 3: genre popularity measured by total IMDB votes ---
fig = sp.make_subplots(
    rows=2,
    cols=1,
    subplot_titles=['Movies', 'Shows'],
    # FIX: bar traces are cartesian, so the subplot spec type is 'xy' —
    # 'Bar' with a capital B is not a valid make_subplots spec type.
    specs=[[{'type': 'xy'}],
           [{'type': 'xy'}]]
)
movie_popularity = go.Bar(
    x=list(movie_popularity_dict.keys()),
    y=list(movie_popularity_dict.values()),
    name='Movie Popularity',
    hoverinfo='x+y'
)
show_popularity = go.Bar(
    x=list(show_popularity_dict.keys()),
    y=list(show_popularity_dict.values()),
    name='Show Popularity',
    hoverinfo='x+y'
)
fig.add_trace(movie_popularity, row=1, col=1)
fig.update_xaxes(title_text='Genres', row=1, col=1)
fig.update_yaxes(title_text='IMDB Votes', row=1, col=1)
fig.add_trace(show_popularity, row=2, col=1)
fig.update_xaxes(title_text='Genres', row=2, col=1)
fig.update_yaxes(title_text='IMDB Votes', row=2, col=1)
# Dark theme layout with a centered white title.
fig.update(
    layout_title_text="Genre Distribution based on IMDB Votes",
    layout_title_font_size = 20,
    layout_title_x=0.5,
    layout_height = 1000,
    layout_paper_bgcolor='rgb(20,20,20)',
    layout_plot_bgcolor='rgb(20,20,20)',
    layout_font=dict(color="white"),
    layout_title_font_color = 'white'
)
fig.show()
# --- Figure 4: IMDB score distribution per genre ---
# create_distplot draws one kernel-density curve per genre from the scores
# of titles flagged 1 for that genre (histogram bars and rug marks hidden).
fig = ff.create_distplot(
[title[(title[genre] == 1) & (title['imdb_score'].notna())]['imdb_score'] for genre in sorted(genres)],
sorted(genres),
show_hist=False,
show_rug=False,
)
# Dark theme styling to match the other figures.
fig.update_layout(
title="IMDB Score Distribution by Genre",
title_font_size = 20,
title_x=0.5,
xaxis_title='IMDB Score',
paper_bgcolor='rgb(20,20,20)',
plot_bgcolor='rgb(20,20,20)',
legend_title='Genre',
font=dict(color="white"),
title_font_color = 'white'
)
fig.show()
This concludes the Exploratory Data Analysis of Amazon Prime Dataset